Description

Create figures for the manuscript, extended abstract and poster.

library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggplot2)
require(ggrepel)
## Loading required package: ggrepel
require(magrittr)
## Loading required package: magrittr
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
## The following object is masked from 'package:igraph':
## 
##     crossing
set.seed(1510)
options(bitmapType='cairo')
`%ni%` <- Negate(`%in%`)

Load data

Un-normalized data:

input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data'
plots_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots'
numbers_file = paste(input_dir, 'dataset_numbers_complete_graph.txt', sep='/')
numbers_df = fread(numbers_file)
numbers_df$dataset = tolower(numbers_df$dataset)

input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, 'topology_results_pearson_pval_0.05.txt', sep='/')
results_selected_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, 'topology_results_mean_pearson_pval_0.05.txt', sep='/')
topology_results_selected_by_size_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, 'predictions_pearson_pval_0.05.txt', sep='/')
predicted_results_df = fread(predictions_file)
analytical_results_file = paste(input_dir, 'analytical_model_results_pearson_pval_0.05.txt', sep='/')
topology_results_selected_analytical_df = fread(analytical_results_file)
analytical_summary_file = paste(input_dir, 'analytical_model_summary_pearson_pval_0.05.txt', sep='/')
analytical_model_summary_df = fread(analytical_summary_file)
analytical_regression_results_file = paste(input_dir, 'analytical_model_regression_results_pearson_pval_0.05.txt', sep='/')
stretched_exponential_regression_df = fread(analytical_regression_results_file)
theoretical_sample_size_file = paste(input_dir, 'theoretical_sample_size_for_correlations_of_datasets.txt', sep='/')
sample_size_correlation_df = fread(theoretical_sample_size_file)

Normalized data:

topology_type_normalization = "divide.L"
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, '/', 'topology_results_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
results_selected_norm_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, '/', 'topology_results_mean_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_by_size_norm_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, '/', 'predictions_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
predicted_results_norm_df = fread(predictions_file)
analytical_results_file = paste(input_dir, '/', 'analytical_model_results_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_analytical_norm_df = fread(analytical_results_file)

Number of networks and maximum sample size:

nrow((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% unique()))
## [1] 2584
max(((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% unique()))$size)
## [1] 1100

Number of networks and maximum sample size without counting TCGA full dataset:

nrow((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% filter(!(type_dataset == "tcga:tcga")) %>% unique()))
## [1] 2584
max(((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% filter(!(type_dataset == "tcga:tcga")) %>% unique()))$size)
## [1] 1100

Make plots

Curve plot

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
results_selected_norm_df %>%
  filter(type_dataset %in% datasets_selected) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
  ggplot(aes(x=size, y=unnorm, col=type_dataset)) +
  geom_point(alpha=0.5, size=3) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  theme_linedraw() +
  xlab("Num. samples") +
  #xlab("log(N)") +
  ylab("Num. significant correlations") +
  guides(col=guide_legend(title="Dataset")) +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))

plot_file = paste(plots_dir, "curve_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 10000,
  height = 6000,
  units = c("px")
)

Regression plot

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
stretched_exponential_regression_df %>% inner_join((topology_results_selected_analytical_df %>% dplyr::select(type_dataset, model, slope, intercept, a, b) %>% unique()), by=c("type_dataset", "model")) %>%
  filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
  ggplot(aes(x=x, y=y, col=type_dataset)) +
  geom_point(alpha=0.5, size=3) +
  geom_abline(aes(intercept=intercept, slope=slope, col=type_dataset), size=1) +
  theme_linedraw() +
  xlab("log(N)") +
  ylab("log[ log(L) - log(S) ]") +
  scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  guides(col=guide_legend(title="Dataset")) +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))

plot_file = paste(plots_dir, "regression_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9500,
  height = 6000,
  units = c("px")
)
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
analytical_model_summary_df$L_vs_total = abs((analytical_model_summary_df$unnorm_L - analytical_model_summary_df$total_num_edges)) / analytical_model_summary_df$total_num_edges
analytical_model_summary_df %>% 
  dplyr::select(type_dataset, model, a, b, L, L_vs_total, adj.r.squared, relative.error.mean) %>% 
  filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()
##                        type_dataset                                 model
## 1: scipher:scipher.complete.dataset Stretched exponential (by linear fit)
## 2:               gtex:artery.tibial Stretched exponential (by linear fit)
## 3:                 gtex:whole.blood Stretched exponential (by linear fit)
## 4:                   tcga:tcga-brca Stretched exponential (by linear fit)
## 5:                   tcga:tcga-coad Stretched exponential (by linear fit)
##           a         b        L L_vs_total adj.r.squared relative.error.mean
## 1: 1.790429  42.98998 1.462534          0     0.9868786          0.13920918
## 2: 1.927336 106.91242 1.228832          0     0.9436667          0.15898866
## 3: 1.650687  21.33857 1.586486          0     0.9832295          0.16228112
## 4: 1.486656  18.55161 3.565799          0     0.9975507          0.06784906
## 5: 1.809127  59.17297 1.770780          0     0.9924464          0.08729841

Prediction plot

#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
results_selected_norm_df %>% right_join((predicted_results_norm_df %>% dplyr::select(type_dataset, model, model_result, size) %>% unique()), by=c("type_dataset", "size")) %>%
  filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
#results_selected_norm_df %>%
#  filter(type_dataset %in% datasets_selected) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
  ggplot(aes(x=size, y=num_edges, col=type_dataset)) +
  geom_point(alpha=0.5, size=3) +
  #geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
  geom_line(aes(x=size, y=model_result, col=type_dataset), size=1) +
  scale_x_continuous(trans = scales::log10_trans()) +
  scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  theme_linedraw() +
  xlab("Num. samples (Log)") +
  ylab("Fraction of significant correlations") +
  guides(col=guide_legend(title="Dataset")) +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
## Warning: Removed 24825 rows containing missing values (geom_point).

plot_file = paste(plots_dir, "prediction_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9500,
  height = 6000,
  units = c("px")
)
## Warning: Removed 24825 rows containing missing values (geom_point).

Comparison of models: stretched exponential vs. logarithm

models_selected = c("Stretched exponential (by linear fit)", "Logarithmic", "Mean")
dataset_selected = "gtex:whole.blood"

# Join predicted results with mean
predicted_results_norm_and_mean_df = rbind((topology_results_selected_by_size_norm_df %>% dplyr::select(type_dataset, size, mean_norm) %>% unique() %>% rename("model_result" = "mean_norm") %>% mutate(model="Mean")), (predicted_results_norm_df %>% dplyr::select(type_dataset, size, model_result, model) %>% unique())) %>% 
  filter(model %in% models_selected) %>%
  mutate(model = replace(model, model == "Stretched exponential (by linear fit)", "Stretched exp.")) %>%
  mutate(model = replace(model, model == "Logarithmic", "Logarithm"))
predicted_results_norm_and_mean_df$model <- factor(predicted_results_norm_and_mean_df$model, levels = c("Mean", "Stretched exp.", "Logarithm"))

# Process and plot
results_selected_norm_df %>% 
  right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
  filter(type_dataset == dataset_selected) %>%
  filter(size <= max((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
  filter(size >= min((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
  ggplot(aes(x=size, y=num_edges, col=model)) +
  geom_point(alpha=0.5, size=3, col="#91faad") +
  geom_line(aes(x=size, y=model_result, col=model), size=1) +
  scale_x_continuous(trans = scales::log10_trans()) +
  scale_color_manual(values = c("#00f549", "#02b0f5", "#fc9403")) +
  theme_linedraw() +
  xlab("Num. samples (Log)") +
  ylab("Fraction of significant correlations") +
  #guides(col=guide_legend(title="Model")) +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_blank(), legend.position="bottom")
## Warning: Removed 72 rows containing missing values (geom_point).

plot_file = paste(plots_dir, "comparison_models_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 6600,
  height = 6700,
  units = c("px")
)
## Warning: Removed 72 rows containing missing values (geom_point).

a vs. fraction of significant correlations plot

#'  calculate_predictions_using_stretched_exponential_model_optimized
#'  Formula to calculate exponential decay of significant interactions from a list of sample sizes.
#'  This formula requires the use of the L parameter
#'  @param x List of sample sizes.
#'  @param a Slope coefficient.
#'  @param b Intercept coefficient.
#'  
calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
  y = L * exp((b*x**(-a+1))/(-a+1))
  #y = exp(log(L) - exp(b+a*log(x)))
  return(y)
}

#'  calculate_prediction_from_analytical_model
#'  Function to calculate predictions using a specific analytical model
#'  @param model Name of the model used.
#'  @param x_list List of values in the x axis (e.g. size)
#'  @param a Parameter a.
#'  @param b Parameter b.
#'  @param L Parameter L.
#'  
calculate_prediction_from_analytical_model = function(model, x_list, a, b, L){
  if(model == "Logarithmic"){
    prediction_result = (log(x_list)*a + b)
  } else if((model == "Stretched exponential (by optimization)") | (model == "Stretched exponential (by linear fit)")){
    prediction_result = calculate_predictions_using_stretched_exponential_model_optimized(x=x_list, L=L, a=a, b=b)
  } else if(model == "Stretched exponential (without L)"){
    prediction_result = calculate_predictions_using_stretched_exponential_model_without_L(x=x_list, a=a, b=b)
  }
  return(prediction_result)
}
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
sample_size_vs_a_df = analytical_model_summary_df %>% inner_join(sample_size_correlation_df, by=c("type_dataset"="dataset")) %>% 
  #filter((type_dataset %in% datasets_selected) & (model == model_selected)) 
  filter(model == model_selected)

# Calculate number of edges for each critical correlation using model
sample_size_vs_a_df$num_edges_from_statistical_corrected = calculate_prediction_from_analytical_model(model=model_selected, x_list=sample_size_vs_a_df$sample_size_statistical_corrected, a=sample_size_vs_a_df$a, b=sample_size_vs_a_df$b, L=sample_size_vs_a_df$L)
sample_size_vs_a_df$num_edges_from_statistical_corrected = sample_size_vs_a_df$num_edges_from_statistical_corrected * sample_size_vs_a_df$max_value_in_dataset
sample_size_vs_a_df$num_edges_from_statistical_corrected_norm = sample_size_vs_a_df$num_edges_from_statistical_corrected / sample_size_vs_a_df$total_num_edges
type_correlation_selected = "weak"
sample_size_vs_a_df %>%
  filter(type_correlation == type_correlation_selected) %>%
  arrange(desc(a))
##                        type_dataset                                 model
##  1:              gtex:artery.tibial Stretched exponential (by linear fit)
##  2: gtex:skin.sun.exposed.lower.leg Stretched exponential (by linear fit)
##  3:                  tcga:tcga-coad Stretched exponential (by linear fit)
##  4:                    gtex:thyroid Stretched exponential (by linear fit)
##  5:            gtex:muscle.skeletal Stretched exponential (by linear fit)
##  6:                  tcga:tcga-luad Stretched exponential (by linear fit)
##  7:                  tcga:tcga-ucec Stretched exponential (by linear fit)
##  8:                       gtex:lung Stretched exponential (by linear fit)
##  9:                gtex:whole.blood Stretched exponential (by linear fit)
## 10:                  tcga:tcga-thca Stretched exponential (by linear fit)
## 11:                  tcga:tcga-kirc Stretched exponential (by linear fit)
## 12:                  tcga:tcga-lusc Stretched exponential (by linear fit)
## 13:                   tcga:tcga-lgg Stretched exponential (by linear fit)
## 14:                  tcga:tcga-skcm Stretched exponential (by linear fit)
## 15:                  tcga:tcga-brca Stretched exponential (by linear fit)
##            a         b        L max_value_in_dataset
##  1: 1.927336 106.91242 1.228832            111967462
##  2: 1.836919  69.54141 1.443081            109822205
##  3: 1.809127  59.17297 1.770780             88426012
##  4: 1.746114  46.00874 1.669286             95645379
##  5: 1.722551  41.77794 1.598193             79087483
##  6: 1.656657  39.41186 2.709381             68040939
##  7: 1.654527  32.50631 2.274983             69647418
##  8: 1.654318  28.07262 1.964261             82744314
##  9: 1.650687  21.33857 1.586486             72312862
## 10: 1.592448  22.92988 2.677422             66112661
## 11: 1.575560  24.71561 3.405391             54688392
## 12: 1.543983  25.78059 4.921295             38777887
## 13: 1.528879  19.73056 3.959177             47356290
## 14: 1.503901  20.60499 6.322108             25885542
## 15: 1.486656  18.55161 3.565799             51204938
##                                                           formula adj.r.squared
##  1: F(x) = 1.23 * exp[(106.91 * x**((1.93) + 1)) / ((1.93) + 1) ]     0.9436667
##  2:  F(x) = 1.44 * exp[(69.54 * x**((1.84) + 1)) / ((1.84) + 1) ]     0.9921006
##  3:  F(x) = 1.77 * exp[(59.17 * x**((1.81) + 1)) / ((1.81) + 1) ]     0.9924464
##  4:  F(x) = 1.67 * exp[(46.01 * x**((1.75) + 1)) / ((1.75) + 1) ]     0.9898823
##  5:   F(x) = 1.6 * exp[(41.78 * x**((1.72) + 1)) / ((1.72) + 1) ]     0.9944041
##  6:  F(x) = 2.71 * exp[(39.41 * x**((1.66) + 1)) / ((1.66) + 1) ]     0.9974881
##  7:  F(x) = 2.27 * exp[(32.51 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9949493
##  8:  F(x) = 1.96 * exp[(28.07 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9838666
##  9:  F(x) = 1.59 * exp[(21.34 * x**((1.65) + 1)) / ((1.65) + 1) ]     0.9832295
## 10:  F(x) = 2.68 * exp[(22.93 * x**((1.59) + 1)) / ((1.59) + 1) ]     0.9960270
## 11:  F(x) = 3.41 * exp[(24.72 * x**((1.58) + 1)) / ((1.58) + 1) ]     0.9944937
## 12:  F(x) = 4.92 * exp[(25.78 * x**((1.54) + 1)) / ((1.54) + 1) ]     0.9875920
## 13:  F(x) = 3.96 * exp[(19.73 * x**((1.53) + 1)) / ((1.53) + 1) ]     0.9979693
## 14:     F(x) = 6.32 * exp[(20.6 * x**((1.5) + 1)) / ((1.5) + 1) ]     0.9782098
## 15:  F(x) = 3.57 * exp[(18.55 * x**((1.49) + 1)) / ((1.49) + 1) ]     0.9975507
##     relative.error.mean  unnorm_L total_num_edges density L_vs_total
##  1:          0.15898866 137589166       137589166       1          0
##  2:          0.13088726 158482306       158482306       1          0
##  3:          0.08729841 156583056       156583056       1          0
##  4:          0.14868418 159659515       159659515       1          0
##  5:          0.07910128 126397050       126397050       1          0
##  6:          0.07100511 184348801       184348801       1          0
##  7:          0.08393989 158446701       158446701       1          0
##  8:          0.28316122 162531435       162531435       1          0
##  9:          0.16228112 114723378       114723378       1          0
## 10:          0.15280302 177011520       177011520       1          0
## 11:          0.11524896 186235350       186235350       1          0
## 12:          0.15592063 190837416       190837416       1          0
## 13:          0.07097702 187491930       187491930       1          0
## 14:          0.21145193 163651186       163651186       1          0
## 15:          0.06784906 182586495       182586495       1          0
##     type_correlation correlation sample_size_statistical
##  1:             weak         0.2                68.77302
##  2:             weak         0.2                68.77302
##  3:             weak         0.2                68.77302
##  4:             weak         0.2                68.77302
##  5:             weak         0.2                68.77302
##  6:             weak         0.2                68.77302
##  7:             weak         0.2                68.77302
##  8:             weak         0.2                68.77302
##  9:             weak         0.2                68.77302
## 10:             weak         0.2                68.77302
## 11:             weak         0.2                68.77302
## 12:             weak         0.2                68.77302
## 13:             weak         0.2                68.77302
## 14:             weak         0.2                68.77302
## 15:             weak         0.2                68.77302
##     sample_size_statistical_corrected num_edges_from_statistical_corrected
##  1:                          911.1627                            111794287
##  2:                          916.9078                            120302266
##  3:                          939.7179                            117463600
##  4:                          920.8103                            109314656
##  5:                          894.8027                             82560413
##  6:                          946.9803                             94655796
##  7:                          939.6221                             90275648
##  8:                          921.3918                             99277579
##  9:                          886.0931                             77194741
## 10:                          945.7492                             90762327
## 11:                          947.8885                             81129977
## 12:                          948.9429                             61156192
## 13:                          947.5851                             69369551
## 14:                          941.3899                             44707084
## 15:                          946.9359                             46983418
##     num_edges_from_statistical_corrected_norm
##  1:                                 0.8125225
##  2:                                 0.7590896
##  3:                                 0.7501680
##  4:                                 0.6846736
##  5:                                 0.6531831
##  6:                                 0.5134603
##  7:                                 0.5697540
##  8:                                 0.6108208
##  9:                                 0.6728772
## 10:                                 0.5127481
## 11:                                 0.4356315
## 12:                                 0.3204623
## 13:                                 0.3699869
## 14:                                 0.2731852
## 15:                                 0.2573214
type_correlation_selected = "weak"
#sample_size_vs_a_df$data_col = ifelse(sample_size_vs_a_df$type_dataset == "gtex:whole.blood", "#E69F00", ifelse(sample_size_vs_a_df$type_dataset == "gtex:artery.tibial", "#D55E00", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-coad", "#56B4E9", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-brca", "#0072B2", "black"))))

sample_size_vs_a_df %>% 
  filter(type_correlation == type_correlation_selected) %>% 
  filter(!(type_dataset == "tcga:tcga")) %>% 
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
  ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) + 
  geom_point() +
  geom_point(data = . %>% filter(type_dataset == "TCGA: Colon cancer"), color = "#56B4E9") +
  geom_point(data = . %>% filter(type_dataset == "TCGA: Breast cancer"), color = "#0072B2") +
  geom_point(data = . %>% filter(type_dataset == "R. arthritis"), color = "#44AA99") +
  geom_point(data = . %>% filter(type_dataset == "GTEx: Whole blood"), color = "#E69F00") +
  geom_point(data = . %>% filter(type_dataset == "GTEx: Artery tibial"), color = "#D55E00") +
  xlab(expression(alpha)) +
  ylab("Fraction of correlations above 0.2") +
  #guides(col=guide_legend(title="Dataset")) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold")) +
  geom_text_repel(
    data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "gtex:whole.blood"),
    aes(label = type_dataset), col = "#E69F00",
    size = 5,
    box.padding = unit(0.5, "lines"),
    point.padding = unit(0.3, "lines")
  ) +
  geom_text_repel(
    data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "gtex:artery.tibial"),
    aes(label = type_dataset), col = "#D55E00",
    size = 5,
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.3, "lines")
  ) +
  geom_text_repel(
    data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected) %>% mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis"))), type_dataset == "R. arthritis"),
    aes(label = type_dataset), col = "#44AA99",
    size = 5,
    box.padding = unit(0.5, "lines"),
    point.padding = unit(0.3, "lines")
  ) +
  geom_text_repel(
    data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "tcga:tcga-brca"),
    aes(label = type_dataset), col = "#0072B2",
    size = 5,
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.3, "lines")
  ) +
  geom_text_repel(
    data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "tcga:tcga-coad"),
    aes(label = type_dataset), col = "#56B4E9",
    size = 5,
    box.padding = unit(0.5, "lines"),
    point.padding = unit(0.3, "lines")
  )

plot_file = paste(plots_dir, "a_vs_fraction_sig_correlations_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 6400,
  height = 6000,
  units = c("px")
)
type_correlation_selected = "weak"
#sample_size_vs_a_df$data_col = ifelse(sample_size_vs_a_df$type_dataset == "gtex:whole.blood", "#E69F00", ifelse(sample_size_vs_a_df$type_dataset == "gtex:artery.tibial", "#D55E00", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-coad", "#56B4E9", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-brca", "#0072B2", "black"))))

sample_size_vs_a_df %>% 
  filter(type_correlation == type_correlation_selected) %>% 
  filter(!(type_dataset == "tcga:tcga")) %>% 
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
  ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) + 
  geom_point() +
  geom_point(data = . %>% filter(type_dataset == "TCGA: Colon cancer"), color = "#56B4E9") +
  geom_point(data = . %>% filter(type_dataset == "TCGA: Breast cancer"), color = "#0072B2") +
  geom_point(data = . %>% filter(type_dataset == "R. arthritis"), color = "#44AA99") +
  geom_point(data = . %>% filter(type_dataset == "GTEx: Whole blood"), color = "#E69F00") +
  geom_point(data = . %>% filter(type_dataset == "GTEx: Artery tibial"), color = "#D55E00") +
  xlab(expression(alpha)) +
  ylab("Fraction of correlations above 0.2") +
  #guides(col=guide_legend(title="Dataset")) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))

plot_file = paste(plots_dir, "a_vs_fraction_sig_correlations_pearson_pval_0.05_no_labels.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 6400,
  height = 6000,
  units = c("px")
)
#(sample_size_vs_a_df %>% select("type_dataset", "a") %>% unique()) %>% inner_join((results_df %>% select("type_dataset", "size") %>% unique()), by=c("type_dataset")) %>%
#  group_by(type_dataset, a) %>%
#  summarize(max_size = max(size)) %>%
#  ggplot(aes(x=a, y=max_size)) + 
#  geom_point()

Levels of correlations plot

Define function to calculate critical sample size for correlation:

calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization = function(r, total_num_edges, alpha=0.05, N_guess=c(10000)){
  optimize_N = function(par){
    N = par[1]
    t_guess = qt(alpha, df=N-2, lower.tail = F)
    t = r * sqrt((N-2)) / sqrt((1-r**2))
    return((abs(t-t_guess)))
  }
  res = optim(par=N_guess, fn=optimize_N, method="Brent", lower=3, upper=50000)
  return(res$par)
}

calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
  y = L * exp((b*x**(-a+1))/(-a+1))
  #y = exp(log(L) - exp(b+a*log(x)))
  return(y)
}

Calculate sample size for different levels of correlation and different datasets:

type_correlation_df = data.frame(type_correlation=c("weak", "moderate", "strong", "very strong"), lower_val=c(0.2,0.4,0.6,0.8), upper_val=c(0.4,0.6,0.8,NA))
types_correlation = c("weak", "moderate", "strong", "very strong")

cols = c("dataset", "type_correlation", "correlation", "sample_size_statistical", "sample_size_statistical_corrected")
sample_size_correlation_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for (dataset in numbers_df$dataset){
  total_num_edges = (numbers_df %>% filter(dataset == !!dataset))$total_num_edges
  for (type_correlation in types_correlation){
    r = (type_correlation_df %>% filter(type_correlation == !!type_correlation))$lower_val
    N = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05, N_guess=c(10000))
    N_corrected = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05/total_num_edges, N_guess=c(10000))
    sample_size_correlation_df <- rbind(sample_size_correlation_df, data.frame(dataset=dataset, type_correlation=type_correlation, correlation=r, sample_size_statistical=N, sample_size_statistical_corrected=N_corrected))
  }
}

print(sample_size_correlation_df)
##                                         dataset type_correlation correlation
## 1              scipher:scipher.complete.dataset             weak         0.2
## 2              scipher:scipher.complete.dataset         moderate         0.4
## 3              scipher:scipher.complete.dataset           strong         0.6
## 4              scipher:scipher.complete.dataset      very strong         0.8
## 5         scipher:scipher.complete.nonresponder             weak         0.2
## 6         scipher:scipher.complete.nonresponder         moderate         0.4
## 7         scipher:scipher.complete.nonresponder           strong         0.6
## 8         scipher:scipher.complete.nonresponder      very strong         0.8
## 9            scipher:scipher.complete.responder             weak         0.2
## 10           scipher:scipher.complete.responder         moderate         0.4
## 11           scipher:scipher.complete.responder           strong         0.6
## 12           scipher:scipher.complete.responder      very strong         0.8
## 13  scipher:scipher.sample.per.patient.baseline             weak         0.2
## 14  scipher:scipher.sample.per.patient.baseline         moderate         0.4
## 15  scipher:scipher.sample.per.patient.baseline           strong         0.6
## 16  scipher:scipher.sample.per.patient.baseline      very strong         0.8
## 17                               tcga:tcga-brca             weak         0.2
## 18                               tcga:tcga-brca         moderate         0.4
## 19                               tcga:tcga-brca           strong         0.6
## 20                               tcga:tcga-brca      very strong         0.8
## 21                        tcga:tcga-brca-normal             weak         0.2
## 22                        tcga:tcga-brca-normal         moderate         0.4
## 23                        tcga:tcga-brca-normal           strong         0.6
## 24                        tcga:tcga-brca-normal      very strong         0.8
## 25                               tcga:tcga-coad             weak         0.2
## 26                               tcga:tcga-coad         moderate         0.4
## 27                               tcga:tcga-coad           strong         0.6
## 28                               tcga:tcga-coad      very strong         0.8
## 29                        tcga:tcga-coad-normal             weak         0.2
## 30                        tcga:tcga-coad-normal         moderate         0.4
## 31                        tcga:tcga-coad-normal           strong         0.6
## 32                        tcga:tcga-coad-normal      very strong         0.8
## 33                               tcga:tcga-kirc             weak         0.2
## 34                               tcga:tcga-kirc         moderate         0.4
## 35                               tcga:tcga-kirc           strong         0.6
## 36                               tcga:tcga-kirc      very strong         0.8
## 37                        tcga:tcga-kirc-normal             weak         0.2
## 38                        tcga:tcga-kirc-normal         moderate         0.4
## 39                        tcga:tcga-kirc-normal           strong         0.6
## 40                        tcga:tcga-kirc-normal      very strong         0.8
## 41                                tcga:tcga-lgg             weak         0.2
## 42                                tcga:tcga-lgg         moderate         0.4
## 43                                tcga:tcga-lgg           strong         0.6
## 44                                tcga:tcga-lgg      very strong         0.8
## 45                               tcga:tcga-luad             weak         0.2
## 46                               tcga:tcga-luad         moderate         0.4
## 47                               tcga:tcga-luad           strong         0.6
## 48                               tcga:tcga-luad      very strong         0.8
## 49                        tcga:tcga-luad-normal             weak         0.2
## 50                        tcga:tcga-luad-normal         moderate         0.4
## 51                        tcga:tcga-luad-normal           strong         0.6
## 52                        tcga:tcga-luad-normal      very strong         0.8
## 53                               tcga:tcga-lusc             weak         0.2
## 54                               tcga:tcga-lusc         moderate         0.4
## 55                               tcga:tcga-lusc           strong         0.6
## 56                               tcga:tcga-lusc      very strong         0.8
## 57                        tcga:tcga-lusc-normal             weak         0.2
## 58                        tcga:tcga-lusc-normal         moderate         0.4
## 59                        tcga:tcga-lusc-normal           strong         0.6
## 60                        tcga:tcga-lusc-normal      very strong         0.8
## 61                               tcga:tcga-skcm             weak         0.2
## 62                               tcga:tcga-skcm         moderate         0.4
## 63                               tcga:tcga-skcm           strong         0.6
## 64                               tcga:tcga-skcm      very strong         0.8
## 65                               tcga:tcga-thca             weak         0.2
## 66                               tcga:tcga-thca         moderate         0.4
## 67                               tcga:tcga-thca           strong         0.6
## 68                               tcga:tcga-thca      very strong         0.8
## 69                        tcga:tcga-thca-normal             weak         0.2
## 70                        tcga:tcga-thca-normal         moderate         0.4
## 71                        tcga:tcga-thca-normal           strong         0.6
## 72                        tcga:tcga-thca-normal      very strong         0.8
## 73                               tcga:tcga-ucec             weak         0.2
## 74                               tcga:tcga-ucec         moderate         0.4
## 75                               tcga:tcga-ucec           strong         0.6
## 76                               tcga:tcga-ucec      very strong         0.8
## 77                        tcga:tcga-ucec-normal             weak         0.2
## 78                        tcga:tcga-ucec-normal         moderate         0.4
## 79                        tcga:tcga-ucec-normal           strong         0.6
## 80                        tcga:tcga-ucec-normal      very strong         0.8
## 81                           gtex:artery.tibial             weak         0.2
## 82                           gtex:artery.tibial         moderate         0.4
## 83                           gtex:artery.tibial           strong         0.6
## 84                           gtex:artery.tibial      very strong         0.8
## 85                   gtex:breast.mammary.tissue             weak         0.2
## 86                   gtex:breast.mammary.tissue         moderate         0.4
## 87                   gtex:breast.mammary.tissue           strong         0.6
## 88                   gtex:breast.mammary.tissue      very strong         0.8
## 89                                    gtex:lung             weak         0.2
## 90                                    gtex:lung         moderate         0.4
## 91                                    gtex:lung           strong         0.6
## 92                                    gtex:lung      very strong         0.8
## 93                         gtex:muscle.skeletal             weak         0.2
## 94                         gtex:muscle.skeletal         moderate         0.4
## 95                         gtex:muscle.skeletal           strong         0.6
## 96                         gtex:muscle.skeletal      very strong         0.8
## 97              gtex:skin.sun.exposed.lower.leg             weak         0.2
## 98              gtex:skin.sun.exposed.lower.leg         moderate         0.4
## 99              gtex:skin.sun.exposed.lower.leg           strong         0.6
## 100             gtex:skin.sun.exposed.lower.leg      very strong         0.8
## 101                                gtex:thyroid             weak         0.2
## 102                                gtex:thyroid         moderate         0.4
## 103                                gtex:thyroid           strong         0.6
## 104                                gtex:thyroid      very strong         0.8
## 105                            gtex:whole.blood             weak         0.2
## 106                            gtex:whole.blood         moderate         0.4
## 107                            gtex:whole.blood           strong         0.6
## 108                            gtex:whole.blood      very strong         0.8
##     sample_size_statistical sample_size_statistical_corrected
## 1                 68.773025                         914.63989
## 2                 18.002295                         216.05522
## 3                  8.523611                          85.91378
## 4                  5.063346                          38.90078
## 5                 68.773025                         915.15007
## 6                 18.002295                         216.17467
## 7                  8.523611                          85.96045
## 8                  5.063346                          38.92117
## 9                 68.773025                         914.79186
## 10                18.002295                         216.09080
## 11                 8.523611                          85.92768
## 12                 5.063346                          38.90685
## 13                68.773025                         914.18952
## 14                18.002295                         215.94977
## 15                 8.523611                          85.87258
## 16                 5.063346                          38.88278
## 17                68.773025                         945.60421
## 18                18.002295                         223.30503
## 19                 8.523611                          88.74614
## 20                 5.063346                          40.13813
## 21                68.773025                         948.92354
## 22                18.002295                         224.08220
## 23                 8.523611                          89.04977
## 24                 5.063346                          40.27077
## 25                68.773025                         938.25966
## 26                18.002295                         221.58542
## 27                 8.523611                          88.07432
## 28                 5.063346                          39.84464
## 29                68.773025                         944.05564
## 30                18.002295                         222.94246
## 31                 8.523611                          88.60449
## 32                 5.063346                          40.07625
## 33                68.773025                         946.55023
## 34                18.002295                         223.52652
## 35                 8.523611                          88.83268
## 36                 5.063346                          40.17593
## 37                68.773025                         946.85197
## 38                18.002295                         223.59717
## 39                 8.523611                          88.86028
## 40                 5.063346                          40.18799
## 41                68.773025                         946.87172
## 42                18.002295                         223.60180
## 43                 8.523611                          88.86208
## 44                 5.063346                          40.18878
## 45                68.773025                         946.06344
## 46                18.002295                         223.41255
## 47                 8.523611                          88.78815
## 48                 5.063346                          40.15648
## 49                68.773025                         942.86340
## 50                18.002295                         222.66332
## 51                 8.523611                          88.49544
## 52                 5.063346                          40.02860
## 53                68.773025                         947.71732
## 54                18.002295                         223.79978
## 55                 8.523611                          88.93943
## 56                 5.063346                          40.22257
## 57                68.773025                         948.72516
## 58                18.002295                         224.03575
## 59                 8.523611                          89.03162
## 60                 5.063346                          40.26284
## 61                68.773025                         940.37012
## 62                18.002295                         222.07955
## 63                 8.523611                          88.26737
## 64                 5.063346                          39.92897
## 65                68.773025                         944.12173
## 66                18.002295                         222.95793
## 67                 8.523611                          88.61054
## 68                 5.063346                          40.07889
## 69                68.773025                         947.49190
## 70                18.002295                         223.74700
## 71                 8.523611                          88.91881
## 72                 5.063346                          40.21356
## 73                68.773025                         938.82521
## 74                18.002295                         221.71783
## 75                 8.523611                          88.12605
## 76                 5.063346                          39.86724
## 77                68.773025                         942.98176
## 78                18.002295                         222.69102
## 79                 8.523611                          88.50626
## 80                 5.063346                          40.03333
## 81                68.773025                         932.07888
## 82                18.002295                         220.13829
## 83                 8.523611                          87.50896
## 84                 5.063346                          39.59765
## 85                68.773025                         938.69085
## 86                18.002295                         221.68638
## 87                 8.523611                          88.11376
## 88                 5.063346                          39.86187
## 89                68.773025                         940.04191
## 90                18.002295                         222.00271
## 91                 8.523611                          88.23735
## 92                 5.063346                          39.91586
## 93                68.773025                         928.02415
## 94                18.002295                         219.18894
## 95                 8.523611                          87.13806
## 96                 5.063346                          39.43562
## 97                68.773025                         938.83594
## 98                18.002295                         221.72035
## 99                 8.523611                          88.12704
## 100                5.063346                          39.86767
## 101               68.773025                         939.18970
## 102               18.002295                         221.80317
## 103                8.523611                          88.15940
## 104                5.063346                          39.88180
## 105               68.773025                         923.39357
## 106               18.002295                         218.10476
## 107                8.523611                          86.71449
## 108                5.063346                          39.25058
dataset_selected = "gtex:whole.blood"
model_selected = "Stretched exponential (by linear fit)"
results_analytical_gtex_wb = topology_results_selected_analytical_norm_df %>% 
  filter((model == model_selected) & (type_dataset == dataset_selected)) %>%
  select(model, type_dataset, a, b, L, max_value_in_dataset) %>% unique()
sample_size_correlation_gtex_wb = sample_size_correlation_df %>%
  filter((dataset == dataset_selected) & (!(type_correlation == "weak")))

predictions_convergence_gtex_wb_norm = calculate_predictions_using_stretched_exponential_model_optimized(x=sample_size_correlation_gtex_wb$sample_size_statistical_corrected, L=results_analytical_gtex_wb$L, a=results_analytical_gtex_wb$a, b=results_analytical_gtex_wb$b)
predictions_convergence_gtex_wb_df = data.frame(size=round(sample_size_correlation_gtex_wb$sample_size_statistical_corrected), num_edges=predictions_convergence_gtex_wb_norm * results_analytical_gtex_wb$max_value_in_dataset, num_edges_norm=predictions_convergence_gtex_wb_norm)
predictions_convergence_gtex_wb_df$num_edges_norm = (predictions_convergence_gtex_wb_df$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
predictions_convergence_gtex_wb_df
##   size num_edges num_edges_norm
## 1  218  42782199     0.37291614
## 2   87  19008527     0.16569009
## 3   39   5649757     0.04924678
predicted_results_wb_df = predicted_results_df %>% filter((model == model_selected) & (type_dataset == dataset_selected))
predicted_results_wb_df$model_result_norm = (predicted_results_wb_df$model_result/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
dataset_selected = "Whole.Blood"
all_topology_results_file = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/analysis_topology.csv'
threshold_selected = 0.05

all_results_df = fread(all_topology_results_file)
results_gtex_wb = all_results_df %>% 
  filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong")))
results_gtex_wb$num_edges_norm = (results_gtex_wb$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L

results_gtex_wb %>% 
  mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
  ggplot(aes(x=size, y=num_edges_norm, col=type_correlation)) +
  geom_point(alpha=0.5, size=3) +
  geom_line(data=(predicted_results_wb_df %>% filter(size <= max((all_results_df %>% filter((type_dataset == dataset_selected) & (threshold == threshold_selected)))$size))), aes(x=size, y=model_result_norm), size=1, color='red') +
  geom_point(data=predictions_convergence_gtex_wb_df, 
           aes(x=size,y=num_edges_norm), 
           color='red',
           size=3) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  theme_linedraw() +
  xlab("Number of samples") +
  #xlab("log(N)") +
  ylab("Fraction of significant correlations") +
  guides(col=guide_legend(title="Correlation level")) +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))

plot_file = paste(plots_dir, "correlation_levels_pearson_pval_0.05_data_gtexwholeblood.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)

Disease modules plot

all_disease_genes_results_file = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/analysis_disease_genes.csv'
threshold_selected = 0.05
dataset_selected = "scipher:scipher.complete.dataset"

all_disease_genes_results_df = fread(all_disease_genes_results_file)
all_disease_genes_results_df$type_dataset = paste(all_disease_genes_results_df$dataset, all_disease_genes_results_df$type_dataset, sep=":") # Join dataset and type_dataset
all_disease_genes_results_df$type_dataset = tolower(all_disease_genes_results_df$type_dataset)

results_scipher_df = all_disease_genes_results_df %>%
  filter((type_dataset == dataset_selected) & (threshold == threshold_selected))
head(results_scipher_df)
##     method dataset                     type_dataset type_tcga_tissue size rep
## 1: pearson scipher scipher:scipher.complete.dataset                   100   1
## 2: pearson scipher scipher:scipher.complete.dataset                   100   1
## 3: pearson scipher scipher:scipher.complete.dataset                   100   1
## 4: pearson scipher scipher:scipher.complete.dataset                   100   1
## 5: pearson scipher scipher:scipher.complete.dataset                   100   1
## 6: pearson scipher scipher:scipher.complete.dataset                   100   1
##    type_correlation threshold                  disease
## 1:              all      0.05        alzheimer disease
## 2:              all      0.05        alzheimer disease
## 3:              all      0.05            heart failure
## 4:              all      0.05 diabetes mellitus type 2
## 5:              all      0.05 diabetes mellitus type 2
## 6:              all      0.05                   asthma
##                         disease_class num_disease_genes num_disease_edges
## 1:                   mental disorders               226              5637
## 2:            nervous system diseases               226              5637
## 3:            cardiovascular diseases               104              1021
## 4:          endocrine system diseases               168              3361
## 5: nutritional and metabolic diseases               168              3361
## 6:             immune system diseases               279              9087
##    fraction_disease_genes num_disease_components num_disease_lcc_nodes
## 1:              0.8828125                     21                   224
## 2:              0.8828125                     21                   224
## 3:              0.8524590                     13                   104
## 4:              0.8795812                     14                   168
## 5:              0.8795812                     14                   168
## 6:              0.8971061                     23                   279
##    num_disease_lcc_edges fraction_disease_lcc_nodes disease_lcc_z
## 1:                  5636                  0.8750000      3.808188
## 2:                  5636                  0.8750000      3.808188
## 3:                  1021                  0.8524590      3.011924
## 4:                  3361                  0.8795812      3.695956
## 5:                  3361                  0.8795812      3.695956
## 6:                  9087                  0.8971061      4.232955
##    disease_lcc_pvalue
## 1:       0.000000e+00
## 2:       0.000000e+00
## 3:       7.076959e-07
## 4:       0.000000e+00
## 5:       0.000000e+00
## 6:       0.000000e+00
#cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
diseases_selected = c("arthritis rheumatoid", "asthma", "breast neoplasms", "heart failure")
results_scipher_df %>%
  filter((disease %in% diseases_selected) & (type_correlation == "moderate-strong-very strong")) %>%
  group_by(size, disease) %>%
  summarize(mean_pval = mean(disease_lcc_pvalue)) %>%
  ggplot(aes(x=size, y=abs(log10(mean_pval)), col=disease)) +
  #geom_point(alpha=0.5, size=3) +
  geom_line(size=1) +
  geom_hline(yintercept=as.numeric(abs(log10(0.05))), linetype="dashed", color="red", size=0.5) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  #scale_color_manual(values = cbbPalette) +
  theme_linedraw() +
  xlab("Number of samples") +
  #xlab("log(N)") +
  ylab("Disease LCC log p-value (abs)") +
  guides(col=guide_legend(title="Disease")) +
  theme(plot.title =  element_text(size = 17, face="bold"), 
        axis.title = element_text(size = 16, face="bold"), 
        axis.text = element_text(size = 15), 
        #legend.position = "bottom",
        legend.text = element_text(size = 14), 
        legend.title=element_text(size=15, face="bold"))
## `summarise()` has grouped output by 'size'. You can override using the `.groups`
## argument.

plot_file = paste(plots_dir, "disease_module_significance_by_diseases_pearson_pval_0.05_data_sciphercomplete.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9700,
  height = 6000,
  units = c("px")
)
results_scipher_df %>%
  filter(disease == "arthritis rheumatoid") %>%
  filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong"))) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
  group_by(size, type_correlation) %>%
  mutate(mean_pval = mean(disease_lcc_pvalue)) %>%
  ungroup() %>%
  ggplot() +
  geom_point(aes(x=size, y=abs(log10(disease_lcc_pvalue)), col=type_correlation), alpha=0.4, size=3) +
  geom_line(aes(x=size, y=abs(log10(mean_pval)), col=type_correlation), size=1) +
  geom_hline(yintercept=as.numeric(abs(log10(0.05))), linetype="dashed", color="red", size=0.5) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  theme_linedraw() +
  xlab("Number of samples") +
  #xlab("log(N)") +
  ylab("Disease LCC log p-value (abs)") +
  guides(col=guide_legend(title="Correlation level")) +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))

plot_file = paste(plots_dir, "disease_module_significance_by_correlations_pearson_pval_0.05_data_sciphercomplete.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)
results_scipher_df %>%
  filter(disease == "arthritis rheumatoid") %>%
  filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong"))) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
  mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
  group_by(size, type_correlation) %>%
  mutate(mean_rlcc = mean(fraction_disease_lcc_nodes)) %>%
  ungroup() %>%
  ggplot() +
  geom_point(aes(x=size, y=fraction_disease_lcc_nodes, col=type_correlation), alpha=0.4, size=3) +
  geom_line(aes(x=size, y=mean_rlcc, col=type_correlation), size=1) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  theme_linedraw() +
  xlab("Number of samples") +
  #xlab("log(N)") +
  ylab("Fraction of disease genes in the LCC") +
  guides(col=guide_legend(title="Correlation level")) +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))

plot_file = paste(plots_dir, "disease_module_rlcc_by_correlations_pearson_pval_0.05_data_sciphercomplete.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9300,
  height = 6000,
  units = c("px")
)
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
all_disease_genes_results_df %>%
  filter((disease == "arthritis rheumatoid") & (type_dataset %in% datasets_selected) & (threshold == threshold_selected) & (type_correlation == "moderate-strong-very strong")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
  mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
  group_by(size, type_dataset) %>%
  mutate(mean_pval = mean(disease_lcc_pvalue)) %>%
  ungroup() %>%
  ggplot() +
  geom_point(aes(x=size, y=abs(log10(disease_lcc_pvalue)), col=type_dataset), alpha=0.5, size=3) +
  #geom_line(aes(x=size, y=abs(log10(mean_pval)), col=type_dataset), size=1) +
  geom_hline(yintercept=as.numeric(abs(log10(0.05))), linetype="dashed", color="red", size=0.5) +
  #scale_x_continuous(trans = scales::log10_trans()) +
  scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
  theme_linedraw() +
  xlab("Number of samples") +
  #xlab("log(N)") +
  ylab("Disease LCC log p-value (abs)") +
  guides(col=guide_legend(title="Dataset")) +
  theme(plot.title =  element_text(size = 17, face="bold"), 
        axis.title = element_text(size = 16, face="bold"), 
        axis.text = element_text(size = 15), 
        legend.text = element_text(size = 14), 
        legend.title=element_text(size=15, face="bold"))

plot_file = paste(plots_dir, "disease_module_significance_by_dataset_pearson_pval_0.05.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9200,
  height = 6000,
  units = c("px")
)